In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as plt
from IPython.display import display_html
In [2]:
# Load the training set from a CSV file given by a relative path.
train_csv_path = 'data/train.csv'
df = pd.read_csv(train_csv_path)
# Show the first 10 rows to get a feel for the sample data.
df.head(n=10)
Out[2]:
In [3]:
# Summary statistics, reporting the 10th/25th/50th/75th/90th percentiles
# rather than only the default quartiles.
summary_percentiles = [.1, .25, .5, .75, .9]
df.describe(percentiles=summary_percentiles)
Out[3]:
从 describe 的输出可以看出各数值列的计数、均值、标准差、最小值/最大值，以及 10%、25%、50%、75%、90% 分位数。
In [4]:
# Draw three histograms side by side in one wide figure:
#   1. Age, 2. Fare, 3. Fare with the maximum value(s) removed.
fig = plt.pyplot.figure(figsize=(30, 4))

age = df['Age']
fare = df['Fare']

# Each entry: (subplot position, data, hist keyword arguments, x label, y label).
# The first two panels pass an explicit range taken from the column's own
# min()/max() -- presumably so missing values cannot break automatic range
# detection; TODO confirm.
panels = [
    (131, age, {'bins': 10, 'range': [age.min(), age.max()]},
     'Age', 'Age distribution'),
    (132, fare, {'bins': 10, 'range': (fare.min(), fare.max())},
     'Fare', 'Fare distribution'),
    # Keep only fares strictly below the maximum so the largest value
    # does not stretch the x axis.
    (133, fare[fare < fare.max()], {'bins': 10},
     'Fare without max', 'Fare distribution'),
]

for position, values, hist_kwargs, x_label, y_label in panels:
    ax = fig.add_subplot(position)
    ax.hist(values, **hist_kwargs)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

plt.pyplot.show()
In [5]:
# Box plot of the Fare column, with one box per distinct Pclass value.
df.boxplot(column='Fare', by='Pclass', figsize=(20, 4))
Out[5]:
In [6]:
# Two bar charts per Pclass: how many records each class has, and what
# share of each class survived.
grouped = df.groupby('Pclass')
fig = plt.pyplot.figure(figsize=(30, 4))

# Left panel: number of non-null Survived entries in each class.
ax = fig.add_subplot(121)
ax.set_title('Pclass count')
ax.set_xlabel('Pclass')
ax.set_ylabel('Count')
# Pass ax explicitly so the bars are drawn on this subplot instead of
# depending on whatever pyplot currently considers the active axes.
grouped.Survived.count().plot(kind='bar', ax=ax)

# Right panel: survival share per class.
ax = fig.add_subplot(122)
ax.set_title('Pclass survived')
ax.set_xlabel('Pclass')
ax.set_ylabel('Survived Percentage')
# mean() is sum()/count() over the non-null values; scale by 100 so the
# plotted numbers really are percentages, as the y-axis label states.
# Assumes Survived is coded 0/1 -- TODO confirm against the data.
(grouped.Survived.mean() * 100).plot(kind='bar', ax=ax)
Out[6]:
In [7]:
# Cross-tabulate (Pclass, Sex) rows against Survived cast to bool, show the
# resulting table, then draw it as stacked bars.
row_keys = [df.Pclass, df.Sex]
survived_flag = df.Survived.astype(bool)
df2 = pd.crosstab(row_keys, survived_flag)

display_html(df2)

# Colors are applied in column order: 'red' for the first crosstab column,
# 'g' (green) for the second.
df2.plot.bar(
    stacked=True,
    color=['red', 'g'],
    figsize=(20, 5),
    fontsize=16,
)
Out[7]:
In [8]:
from IPython.display import FileLink

# Render a link to the notebook file named below as this cell's output.
next_notebook = 'Titanic baby step for pandas Part 2.ipynb'
FileLink(next_notebook)
Out[8]: